# Web scraper: downloads the second novel (chapter by chapter) into a text file.
import requests
import re
#下载一个网页
# Index page of the target novel.
url = 'http://www.jingcaiyuedu.com/book/15401.html'
# Send an HTTP GET request for the index page.
response = requests.get(url)
# The site serves UTF-8; set it explicitly before reading .text.
response.encoding = 'utf-8'
# Raw HTML of the novel's index page.
html = response.text

# Novel title, taken from the og:title meta tag.
# Guard against markup changes instead of crashing with IndexError on [0].
title_matches = re.findall(r'<meta property="og:title" content="(.*?)"/>', html)
if not title_matches:
    raise RuntimeError('Could not find the novel title on the index page')
title = title_matches[0]
print(title)

# Per-chapter info: each <dd class="col-md-3"> wraps one chapter link.
dl = re.findall(r'<dd class="col-md-3">.*?</dd>', html, re.S)
# (href, chapter title) pairs extracted from the <a> tags.
chapter_info_list = re.findall(r'<a href="(.*?)">(.*?)</a>', str(dl), re.S)
print(chapter_info_list)

# Write every chapter into "<title>.txt".  The context manager guarantees
# the file is closed (and buffers flushed) even if a download fails mid-run;
# the original opened the file and never closed it.
with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
    # Loop over every chapter.
    for chapter_url, chapter_title in chapter_info_list:
        # Chapter hrefs are site-relative; build the absolute URL.
        chapter_url = "http://www.jingcaiyuedu.com%s" % chapter_url
        print(chapter_url, chapter_title)

        # Download the chapter page.
        chapter_response = requests.get(chapter_url)
        chapter_response.encoding = 'utf-8'
        chapter_html = chapter_response.text

        # Extract the chapter body.  Skip chapters whose markup doesn't
        # match instead of aborting the whole download with IndexError.
        content_matches = re.findall(
            r'<div class="panel-body" id="htmlContent">(.*?)</div>',
            chapter_html, re.S)
        if not content_matches:
            print('Skipping chapter with unexpected markup:', chapter_title)
            continue
        chapter_content = content_matches[0]
        # Clean up: strip the <br> line-break tags left in the HTML.
        chapter_content = chapter_content.replace('<br />', '')
        chapter_content = chapter_content.replace('<br>', '')
        print(chapter_content)
        fb.write(chapter_title)
        fb.write(chapter_content)
        fb.write("\n")
